• PCA looks to find a low-dimensional representation of the observations that explain a good fraction of the variance; • Clustering looks to find homogeneous subgroups among the observations.
K-means clustering is a simple and elegant approach for partitioning a data set into K distinct, non-overlapping clusters. To perform K-means clustering, we must first specify the desired number of clusters K; then the K-means algorithm will assign each observation to exactly one of the K clusters. Figure 10.5 shows the results obtained from performing K-means clustering on a simulated example consisting of 150 observations in two dimensions, using three different values of K.
source("setup.R")
## Loading required package: reshape2
## Warning: package 'reshape2' was built under R version 3.5.3
## Loading required package: gridExtra
## Warning: package 'gridExtra' was built under R version 3.5.3
## Loading required package: plotly
## Warning: package 'plotly' was built under R version 3.5.3
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## Loading required package: ggmosaic
## Warning: package 'ggmosaic' was built under R version 3.5.3
## Loading required package: corrgram
## Warning: package 'corrgram' was built under R version 3.5.3
## Loading required package: party
## Warning: package 'party' was built under R version 3.5.3
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.5.2
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.5.2
## Loading required package: tidyverse
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages -------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble 1.4.2 v purrr 0.2.5
## v tidyr 0.8.1 v dplyr 0.7.6
## v readr 1.1.1 v stringr 1.3.1
## v tibble 1.4.2 v forcats 0.3.0
## -- Conflicts ----------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x stringr::boundary() masks strucchange::boundary()
## x dplyr::combine() masks gridExtra::combine()
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 3.5.2
## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over all variables, use `summarise_all()`
## Using Class, record_id as id variables
t3.data.ssq.base <- filter(t3.data.ssq, redcap_event_name == "baseline_arm_1")
t3.data.ssq.base$ssq12_dk_complete <- NULL
t3.data.ssq.base$ssq_space_mean <- as.integer(t3.data.ssq.base$ssq_space_mean)
t3.data.ssq.base$ssq_speech_mean <- as.integer(t3.data.ssq.base$ssq_speech_mean)
t3.data.ssq.base$ssq_sound_mean <- as.integer(t3.data.ssq.base$ssq_sound_mean)
t3.data.ssq.base <- na.omit(t3.data.ssq.base)
t3.data.ssq.diff <- t3.data.ssq %>% filter(redcap_event_name == "baseline_arm_1") %>% select(record_id, IsDrawerUser, IsT1DrawerUser) %>% merge(ssq_diff, by="record_id")
set.seed(20)
clusters <- kmeans(t3.data.ssq.base[,15:17], 3)
# Save the cluster number in the dataset as column 'Borough'
t3.data.ssq.base$Cluster <- as.factor(clusters$cluster)
# Sorting
sorted <- t3.data.ssq.base %>% group_by(Cluster) %>% summarise(mean = mean(c(ssq_speech_mean, ssq_space_mean, ssq_sound_mean)), n = n())
sorted_borough <- sorted[order(sorted$mean),]$Cluster
t3.data.ssq.base$Cluster <- factor(t3.data.ssq.base$Cluster, levels = sorted_borough, ordered = TRUE)
str(clusters)
## List of 9
## $ cluster : Named int [1:1298] 1 1 1 2 2 1 1 1 1 3 ...
## ..- attr(*, "names")= chr [1:1298] "2" "3" "4" "6" ...
## $ centers : num [1:3, 1:3] 34.3 19 63.2 60.5 26.1 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:3] "1" "2" "3"
## .. ..$ : chr [1:3] "ssq_speech_mean" "ssq_space_mean" "ssq_sound_mean"
## $ totss : num 1848568
## $ withinss : num [1:3] 335939 151511 179622
## $ tot.withinss: num 667072
## $ betweenss : num 1181496
## $ size : int [1:3] 647 293 358
## $ iter : int 3
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
# Writing
t3.data.ssq.base %>% group_by(Cluster) %>%
summarise(speech = mean(ssq_speech_mean), sound = mean(ssq_sound_mean), space = mean(ssq_space_mean) , n = n())
#plotting
ggplot(t3.data.ssq.base) +
geom_point(aes(x=ssq_speech_mean, y=ssq_space_mean, color=Cluster))
ggplot(t3.data.ssq.base) +
geom_bar(aes(x=Cluster, fill=IsT1DrawerUser, color=IsDrawerUser), position='fill')+
scale_fill_hue(l=60, c=60)
t3.data.ssq.base <- merge(t3.data.ssq.base, df_audiogram, by=c('record_id'))
ggplot(t3.data.ssq.base) +
geom_bar(aes(x=Cluster, fill=Class), position='fill', color="black") +
scale_fill_hue(l=80, c=80)
# Elbow method
k.max <- 15
data <- na.omit(t3.data.ssq.base[,15:17])
wss <- sapply(1:k.max,
function(k){kmeans(data, k, nstart=50,iter.max = 15 )$tot.withinss})
wss
## [1] 3697136.2 1904148.9 1334143.5 1101252.3 946272.6 829610.7 750970.5
## [8] 694473.6 643674.8 601272.0 563326.3 533142.5 503068.6 477330.0
## [15] 457858.6
plot(1:k.max, wss,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")
Makes everything lag but very 3 dimensionally
plot_ly(t3.data.ssq.base, x = ~ssq_space_mean, y = ~ssq_speech_mean, z = ~ssq_sound_mean, color = ~Cluster) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'space'),
yaxis = list(title = 'speech'),
zaxis = list(title = 'sound')))
t3.data.ssq.diff <- t3.data.ssq %>% filter(redcap_event_name == "baseline_arm_1") %>% select(record_id, IsDrawerUser, IsT1DrawerUser) %>% merge(ssq_diff, by="record_id")
t3.data.ssq.diff <- t3.data.ssq.diff %>% filter(!is.na(ssq_speech_mean) & !is.na(ssq_space_mean) & !is.na(ssq_sound_mean))
set.seed(20)
clusters <- kmeans(t3.data.ssq.diff[,18:20], 5)
# Save the cluster number in the dataset as column 'Borough'
t3.data.ssq.diff$Cluster <- as.factor(clusters$cluster)
# Sorting
sorted <- t3.data.ssq.diff %>% group_by(Cluster) %>% summarise(mean = mean(c(ssq_speech_mean, ssq_space_mean, ssq_sound_mean)), n = n())
sorted_borough <- sorted[order(sorted$mean),]$Cluster
t3.data.ssq.diff$Cluster <- factor(t3.data.ssq.diff$Cluster, levels = sorted_borough, ordered = TRUE)
str(clusters)
## List of 9
## $ cluster : int [1:1685] 5 5 1 5 1 5 3 2 5 4 ...
## $ centers : num [1:5, 1:3] -7.57 14.27 49.07 33.97 2.76 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:5] "1" "2" "3" "4" ...
## .. ..$ : chr [1:3] "ssq_speech_mean" "ssq_space_mean" "ssq_sound_mean"
## $ totss : num 1911946
## $ withinss : num [1:5] 113325 122724 117586 122586 189591
## $ tot.withinss: num 665812
## $ betweenss : num 1246134
## $ size : int [1:5] 182 319 151 275 758
## $ iter : int 4
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
# Writing
t3.data.ssq.diff %>% group_by(Cluster) %>%
summarise(speech = mean(ssq_speech_mean), sound = mean(ssq_sound_mean), space = mean(ssq_space_mean) , n = n())
#plotting
ggplot(t3.data.ssq.diff) +
geom_point(aes(x=ssq_speech_mean, y=ssq_space_mean, color=Cluster))
ggplot(t3.data.ssq.diff) +
geom_bar(aes(x=Cluster, fill=IsT1DrawerUser, color=IsDrawerUser), position='fill')+
scale_fill_hue(l=60, c=60)
t3.data.ssq.diff <- merge(t3.data.ssq.diff, df_audiogram, by=c('record_id'))
ggplot(t3.data.ssq.diff) +
geom_bar(aes(x=Cluster, fill=Class), position='fill', color="black") +
scale_fill_hue(l=80, c=80)
# Elbow method
k.max <- 15
data <- na.omit(t3.data.ssq.diff[,18:20])
wss <- sapply(1:k.max,
function(k){kmeans(data, k, nstart=50,iter.max = 15 )$tot.withinss})
wss
## [1] 3822060.5 2150212.5 1720055.8 1496068.4 1329222.6 1194539.3 1089503.7
## [8] 1019190.7 954276.5 892790.4 840477.5 792655.8 751667.4 718613.9
## [15] 687978.9
plot(1:k.max, wss,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")
plot_ly(t3.data.ssq.diff, x = ~ssq_space_mean, y = ~ssq_speech_mean, z = ~ssq_sound_mean, color = ~Cluster) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'space'),
yaxis = list(title = 'speech'),
zaxis = list(title = 'sound')))
ggplot(t3.data.ssq.diff) +
geom_bar(aes(x=Cluster, fill=IsT1DrawerUser, color=IsDrawerUser), position='fill')+
scale_fill_hue(l=60, c=60)
t3.data.ssq.diff
set.seed(20)
#elbow
k.max <- 15
data <- na.omit(df_motivation[,5:6])
wss <- sapply(1:k.max,
function(k){kmeans(data, k, nstart=50,iter.max = 15 )$tot.withinss})
wss
## [1] 248931.48 113535.94 80748.37 60625.13 49358.53 39878.65 31730.11
## [8] 24878.51 21389.23 19099.97 16941.48 15498.69 14363.73 12497.27
## [15] 11626.96
plot(1:k.max, wss,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")
clusters <- kmeans(na.omit(df_motivation[,5:6]), 4)
# Save the cluster number in the dataset as column 'Borough'
df_motivation.k <- na.omit(df_motivation)
df_motivation.k$Cluster <- as.factor(clusters$cluster)
str(clusters)
## List of 9
## $ cluster : Named int [1:504] 4 4 4 4 4 4 3 4 4 1 ...
## ..- attr(*, "names")= chr [1:504] "3" "4" "6" "8" ...
## $ centers : num [1:4, 1:2] 75 43.9 91.2 96.9 84.3 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:4] "1" "2" "3" "4"
## .. ..$ : chr [1:2] "motivation_line_1_ver2" "motivation_line_2_ver2"
## $ totss : num 248931
## $ withinss : num [1:4] 20066 16644 10286 13671
## $ tot.withinss: num 60666
## $ betweenss : num 188266
## $ size : int [1:4] 106 28 38 332
## $ iter : int 4
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
ggplot(df_motivation.k) +
geom_bin2d(aes(x=motivation_line_1_ver2, y=motivation_line_2_ver2, color=Cluster))
-```{r} t3.ssq.base.pca <- prcomp(t3.data.ssq.base[,c(3:14)], center = TRUE,scale. = TRUE) summary(t3.ssq.base.pca) plot(t3.ssq.base.pca)
#elbow k.max <- 15 data <- na.omit(t3.ssq.base.pca$x[,1:3])
wss <- sapply(1:k.max, function(k){kmeans(data, k, nstart=50,iter.max = 15 )$tot.withinss}) wss
plot(1:k.max, wss, type=“b”, pch = 19, frame = FALSE, xlab=“Number of clusters K”, ylab=“Total within-clusters sum of squares”)
clusters <- kmeans(na.omit(t3.ssq.base.pca$x[,1:3]), 3)
bla <- cbind(na.omit(t3.data.ssq.base[,3:14]), t3.ssq.base.pca$rotation) ```